# download_jins_issue.py
# JINS (Journal on Interactive Systems) Downloader
# -------------------------------------------------
# Automates downloading PDFs from JINS OJS issue pages
# - Parses article titles and links from issue pages
# - Follows each article's "view" page to extract the direct PDF link from <meta name="citation_pdf_url">
# - Handles both absolute and relative URLs using urljoin
# - Creates dynamic folder names like JINS_Vol11_Issue1_2020 from issue <title> tag
# - Saves each PDF with sanitized filenames for cross-platform compatibility
# - Logs all downloads to CSV
# - Reusable for other OJS journals with similar PDF meta tag structures
# -------------------------------------------------
# - Usage: run the script and paste an issue URL at the prompt, e.g.:
#   https://journals-sol.sbc.org.br/index.php/jis/issue/view/volume11
# - Parses article titles from the issue page (h3.title a[href])
# - Follows each article page and extracts the real PDF from:
#       <meta name="citation_pdf_url" content=".../article/download/.../..." />
# - Saves PDFs to: JINS_Vol{vol}_Issue{iss}_{year}
# - Logs results to CSV and prints a quick [INFO] count

import re
import csv
import time
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Default HTTP headers for every request issued through get(); get() copies
# this dict before optionally adding a Referer, so it is never mutated.
# The User-Agent is a desktop Chrome string (presumably so the OJS server
# treats the script like a regular browser — not verified here).
BASE_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )
}
# Passed as the `timeout` argument of requests.get() in get() (seconds).
TIMEOUT = 60
# Passed to time.sleep() in main() after each successfully saved PDF;
# the sleep is skipped when the value is not positive.
PAUSE_SEC = 0.4  # polite delay between downloads

def sanitize_filename(name: str) -> str:
    """
    Turn an arbitrary title into a filename stem that is safe across platforms.

    Steps:
      - drop characters that are illegal in Windows filenames (\\ / * ? : " < > |)
        together with NUL and other non-whitespace control characters
      - collapse every whitespace run (tabs and newlines included) to one space
      - collapse runs of dots and trim leading/trailing dots and spaces
      - cap the length at 180 characters, then trim again so the cut cannot
        leave a trailing space or dot (Windows silently drops those, which
        would make the name on disk differ from the logged one)

    Returns "untitled" when nothing usable is left, so callers never build a
    bare ".pdf" filename.
    """
    # Illegal punctuation plus control chars that `\s` does not cover.
    name = re.sub(r'[\\/*?:"<>|\x00-\x08\x0e-\x1b\x7f]', "", name)
    name = re.sub(r"\s+", " ", name).strip()
    name = re.sub(r"\.+", ".", name).strip(". ")
    # Truncation may end on a space or dot; strip the tail once more.
    name = name[:180].rstrip(". ")
    return name or "untitled"

def get(url: str, referer: str | None = None) -> requests.Response:
    """
    Issue a GET request with the shared default headers.

    url:     address to fetch.
    referer: when truthy, sent as the Referer header on top of BASE_HEADERS.

    Returns the response object. Non-success HTTP statuses are turned into
    an exception by raise_for_status(); TIMEOUT is applied to the request.
    """
    # Build a fresh dict each call so BASE_HEADERS itself is never modified.
    request_headers = {**BASE_HEADERS, "Referer": referer} if referer else {**BASE_HEADERS}
    response = requests.get(url, headers=request_headers, timeout=TIMEOUT)
    response.raise_for_status()
    return response

def soup_for(url: str, referer: str | None = None) -> BeautifulSoup:
    """
    Download `url` via get() and parse the response text as HTML.

    `referer` is forwarded to get() unchanged. Parsing uses the
    "html.parser" backend.
    """
    html = get(url, referer=referer).text
    return BeautifulSoup(html, "html.parser")

def ensure_pdf(resp: requests.Response) -> bool:
    """
    Report whether a response looks like a PDF.

    True when the Content-Type header mentions "pdf" (case-insensitive);
    otherwise True only if the body starts with the "%PDF-" signature.
    The body is inspected only when the header check fails.
    """
    declared_type = resp.headers.get("Content-Type")
    if declared_type and "pdf" in declared_type.lower():
        return True
    return resp.content[:5] == b"%PDF-"

# Matches issue labels such as "Vol. 11 No. 1 (2020)".
# Groups: 1 = volume, 2 = issue number, 3 = four-digit year (19xx or 20xx).
_ISSUE_LABEL_RE = re.compile(
    r"Vol\.?\s*(\d+)\s+No\.?\s*(\d+)\s*\(((?:19|20)\d{2})\)", re.I
)

def parse_folder(issue_soup: BeautifulSoup) -> tuple[str, str, str]:
    """
    Extract (volume, issue, year) as strings from an issue page, e.g. from
      'Vol. 11 No. 1 (2020) | Journal on Interactive Systems'

    The page <title> is searched first; if it is missing or does not carry
    the label, the whole document text is searched instead. When neither
    matches, the placeholders ("XX", "YY", "Year") are returned.

    The year is taken from the parenthesised group of the match itself, so a
    volume or issue number that happens to look like a year (e.g. "Vol. 1999")
    cannot be mistaken for it.
    """
    title_text = issue_soup.title.get_text(" ", strip=True) if issue_soup.title else ""
    match = _ISSUE_LABEL_RE.search(title_text)
    if match is None:
        # Fallback: scan the full page text for the same label.
        match = _ISSUE_LABEL_RE.search(issue_soup.get_text(" ", strip=True))
    if match is None:
        return "XX", "YY", "Year"
    return match.group(1), match.group(2), match.group(3)

def collect_issue_articles(issue_url: str, issue_soup: BeautifulSoup) -> list[dict]:
    """
    List the articles linked from an issue page.

    Every anchor matched by the selector `h3.title a[href]` yields one dict:
      - "title":       the anchor's text
      - "article_url": the anchor's href resolved against `issue_url`
    Anchors whose href is empty after trimming whitespace are left out.
    Results keep the order in which the anchors appear on the page.
    """
    return [
        {
            "title": anchor.get_text(" ", strip=True),
            "article_url": urljoin(issue_url, link),
        }
        for anchor in issue_soup.select("h3.title a[href]")
        if (link := (anchor.get("href") or "").strip())
    ]

def extract_pdf_url_from_article(article_url: str) -> str | None:
    """
    Fetch an article page and return the URL of its PDF, or None if no
    candidate link is found.

    Lookup order:
      1. <meta name="citation_pdf_url" content=".../article/download/{id}/{fileId}">
      2. the OJS galley link (a.obj_galley_link.pdf)
      3. any anchor whose href contains '/article/download/' (OJS serves the
         PDF from that path even without a '.pdf' extension)

    Every candidate is resolved against `article_url`, so relative values
    become absolute; an already-absolute value is returned unchanged.
    The page is requested with the article URL minus its last path segment
    as Referer. Errors raised while fetching the page propagate to the caller.
    """
    art_soup = soup_for(article_url, referer=article_url.rsplit("/", 1)[0])

    # Primary: citation meta. A blank content value is ignored so that the
    # fallbacks below still get a chance instead of returning an empty URL.
    meta = art_soup.find("meta", attrs={"name": "citation_pdf_url", "content": True})
    if meta:
        content = meta["content"].strip()
        if content:
            return urljoin(article_url, content)

    # Fallback 1: OJS "PDF" link
    a = art_soup.find("a", class_=re.compile(r"\bobj_galley_link\b.*\bpdf\b", re.I), href=True)
    if a:
        return urljoin(article_url, a["href"].strip())

    # Fallback 2: any /article/download/
    any_a = art_soup.find("a", href=re.compile(r"/article/download/"))
    if any_a:
        return urljoin(article_url, any_a["href"].strip())

    return None

def main():
    """
    Interactive entry point: download every article PDF of one issue.

    Flow:
      1. prompt for the issue URL (exit quietly when it is empty)
      2. fetch the issue page and derive the output folder name
         JINS_Vol{vol}_Issue{iss}_{year} from it
      3. for each article listed on the page, resolve its PDF URL,
         download it, and save it as "<sanitized title>.pdf"
      4. record one row per article in a CSV log inside the output folder

    A failure on one article is logged and does not stop the remaining ones.
    Failures while fetching the issue page itself are not caught here.
    """
    issue_url = input("Paste JINS issue URL (e.g., https://journals-sol.sbc.org.br/index.php/jis/issue/view/volume11): ").strip()
    if not issue_url:
        print("No URL provided. Exiting.")
        return

    print("[INFO] Fetching issue page…")
    issue_soup = soup_for(issue_url)

    vol, iss, yr = parse_folder(issue_soup)
    outdir = Path(f"JINS_Vol{vol}_Issue{iss}_{yr}")
    outdir.mkdir(parents=True, exist_ok=True)
    log_csv = outdir / f"JINS_Vol{vol}_Issue{iss}_{yr}_log.csv"

    rows = collect_issue_articles(issue_url, issue_soup)
    print(f"[INFO] Found {len(rows)} items on this issue page")

    saved = 0
    # Filenames written during this run, case-folded because common
    # Windows/macOS filesystems treat names case-insensitively.
    used_names: set[str] = set()
    with log_csv.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["Title", "Article URL", "PDF URL", "Filename", "Status"])

        for i, it in enumerate(rows, 1):
            # An empty stem would produce a bare ".pdf"; fall back to a numbered name.
            title = sanitize_filename(it["title"]) or f"article_{i}"
            article_url = it["article_url"]
            # Kept outside the try so the error row can still report a URL
            # that was resolved before the failure.
            pdf_url = ""

            try:
                pdf_url = extract_pdf_url_from_article(article_url) or ""
                if not pdf_url:
                    w.writerow([title, article_url, "", "", "Skipped (no PDF link found)"])
                    print(f"[{i}] ⚠️ No PDF: {title}")
                    continue

                pdf_resp = get(pdf_url, referer=article_url)
                if not ensure_pdf(pdf_resp):
                    w.writerow([title, article_url, pdf_url, "", "Skipped (not a PDF response)"])
                    print(f"[{i}] ❌ Not a PDF: {title}")
                    continue

                # Two articles can share a title (e.g. "Editorial"); add a
                # numeric suffix instead of overwriting the earlier file.
                fname = f"{title}.pdf"
                suffix = 2
                while fname.casefold() in used_names:
                    fname = f"{title} ({suffix}).pdf"
                    suffix += 1
                used_names.add(fname.casefold())

                (outdir / fname).write_bytes(pdf_resp.content)
                w.writerow([title, article_url, pdf_url, fname, "OK"])
                print(f"[{i}] ✅ Saved: {fname}")
                saved += 1
                if PAUSE_SEC > 0:
                    time.sleep(PAUSE_SEC)

            except Exception as e:
                # Per-article boundary: record the failure and keep going.
                w.writerow([title, article_url, pdf_url, "", f"Error: {e}"])
                print(f"[{i}] ❌ Error: {e}")

    print(f"\nDone! {saved} PDFs saved in {outdir}")
    print(f"Log: {log_csv}")

if __name__ == "__main__":
    main()
